#!/usr/local/python3/bin/python3
import json
import datetime
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import time

# Elasticsearch destination: every bulk action built by this script is
# addressed to this index and document type.
es_index = "ipabstract"
es_type = "patent"

# 1-based number of the first input line to import; earlier lines are skipped.
start = 1
# Input file, read line by line with each line parsed as one JSON document.
inputJsonPath = "F:/专利数据/2014/20140401_20140430.json"

# Client used for all bulk requests issued by this script.
es = Elasticsearch(
    hosts=[
        {
            "host": "192.168.1.167",
            "port": "9200",
        },
    ],
)


def convertDate(date):
    """Parse the leading timestamp of *date* into a naive ``datetime``.

    Only the first 19 characters are considered, with every ``'T'`` among
    them treated as a space; anything after them (for example fractional
    seconds or a zone suffix) is ignored.

    Raises:
        ValueError: if those characters do not match ``%Y-%m-%d %H:%M:%S``.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    # 'T' -> ' ' is a one-for-one substitution, so slicing first is safe.
    leading = date[:19]
    normalized = leading.replace('T', ' ')
    return datetime.datetime.strptime(normalized, timestamp_format)


# Import every line of the input file (from line `start` onwards) into
# Elasticsearch, then print the elapsed wall-clock time.
s = time.time()
patentAbstractList = []
# The context manager closes the input file even if parsing or a bulk
# request raises part-way through.
with open(inputJsonPath, encoding="utf-8") as file:
    # Number lines from 1 so they compare directly against `start`.
    for idx, line in enumerate(file, 1):
        # Skip lines before the requested starting line, and blank lines
        # (e.g. a trailing empty line), which json.loads would reject and
        # thereby abort the run before the final batch is sent.
        if idx < start or not line.strip():
            continue
        data = json.loads(line)
        source = {"patentAbstract": data['patentAbstract'], "pid": data['pid'],
                  "publishDate": convertDate(data['publishDate']['$date'])}
        patentAbstract = {"_index": es_index, "_type": es_type, "_id": data['_id']['$oid'], "_source": source}
        patentAbstractList.append(patentAbstract)
        # Flush every 10000 actions so the pending list stays bounded.
        if len(patentAbstractList) == 10000:
            helpers.bulk(es, patentAbstractList)
            patentAbstractList.clear()

# Send whatever remains from the last, partially filled batch.
if patentAbstractList:
    helpers.bulk(es, patentAbstractList)
    patentAbstractList.clear()

e = time.time()
print('耗时：%s' % (e - s))
